# Code to replicate figures 1-3

# Libraries & Data
library(tidyverse)
library(thematic)
library(gridExtra)
library(quanteda)

# Load the document-feature matrix (`dfm_ecb`) and the speech corpus
# (`ecb_corpus`) from .rdata files in the working directory.
options(stringsAsFactors = FALSE)
load("dfm_ecb.rdata")
load(file = "ecb_corpus.rdata")

# Exclude press conferences: drop every document whose first 100 characters
# contain "press conf". A logical mask is used instead of `-which(...)`:
# with no match, `which()` returns integer(0) and negative indexing with an
# empty vector would silently drop ALL rows instead of none.
is_press_conference <- grepl("press conf", substr(ecb_corpus$text, 1, 100))
ecb_corpus <- ecb_corpus[!is_press_conference, ]

# Vocabulary selection: `b` holds the feature names that are matched (as fixed
# strings) against the document-feature matrix further down.
# Uncomment the line for the desired figure, then run the rest of the code,
# to produce figures 1-3.
b <- c("structural_policy","structural_reform") # Fig 1: Structural Reform
#b <- c("unit_labour","unit_labor") # Fig 2: Unit labour costs
#b <- c("legitimacy") # Fig 3: Legitimacy

# Compute term frequency of specific vocabulary ----
#
# Result: `Tabelle`, a data frame with one row per year (1999-2019) and the
# columns `Year` and `Relative term frequency`. The latter is the yearly mean
# of (vocabulary count / word count) per document, expressed in percent.
CBs <- c("ECB")

# Per-document count of the selected vocabulary `b`. This does not depend on
# the central bank, so it is computed once instead of once per loop pass.
vocab_dfm <- convert(
  dfm_keep(dfm_ecb, pattern = b, valuetype = "fixed"),
  to = "data.frame"
)
# The first column returned by convert() is the document id (quanteda
# convention); all remaining columns are feature counts. `drop = FALSE` keeps
# this working for one matched feature and yields zeros when none matched.
vocab_count <- as.integer(rowSums(vocab_dfm[, -1, drop = FALSE]))

Tabelle <- lapply(CBs, function(CB) {
  speeches <- ecb_corpus[ecb_corpus$Central_Bank %in% CB, ]
  # NOTE(review): the dfm is not subset by central bank, so its rows are
  # assumed to line up one-to-one with the selected speeches. Fail loudly
  # rather than let recycling produce misaligned figures.
  stopifnot(nrow(speeches) == length(vocab_count))

  data.frame(
    Year = as.integer(speeches$Year),
    share = vocab_count / as.integer(speeches$Words)
  ) |>
    group_by(Year) |>
    # Mean share per year, written as sum / document count.
    summarise(value = sum(share) / n(), .groups = "drop")
}) |>
  bind_rows()

# Keep 1999-2019 (rows with a missing year are dropped) and express the
# share in percent.
Tabelle <- Tabelle |>
  dplyr::filter(Year > 1998, Year < 2020) |>
  mutate(`Relative term frequency` = value * 100) |>
  select(Year, `Relative term frequency`) |>
  as.data.frame()

# Compute proportion of speeches which contain specific vocabulary ----
#
# Result: `Tabelle_per_speech`, a data frame with one row per year (1999-2019)
# and the columns `Year` and `Document proportion`. The latter is the
# percentage of that year's documents in which the vocabulary `b` occurs at
# least once.
CBs <- c("ECB")

# Per-document indicator (1/0) of whether any term of `b` occurs. This does
# not depend on the central bank, so it is computed once.
vocab_dfm <- convert(
  dfm_keep(dfm_ecb, pattern = b, valuetype = "fixed"),
  to = "data.frame"
)
# The first column returned by convert() is the document id (quanteda
# convention); all remaining columns are feature counts. `drop = FALSE` keeps
# this working for one matched feature and yields zeros when none matched.
contains_vocab <- as.numeric(rowSums(vocab_dfm[, -1, drop = FALSE]) > 0)

Tabelle_per_speech <- lapply(CBs, function(CB) {
  speeches <- ecb_corpus[ecb_corpus$Central_Bank %in% CB, ]
  # NOTE(review): the dfm is not subset by central bank, so its rows are
  # assumed to line up one-to-one with the selected speeches. Fail loudly
  # rather than let recycling produce misaligned figures.
  stopifnot(nrow(speeches) == length(contains_vocab))

  data.frame(
    # Year is converted before grouping so that groups are formed on the
    # integer year.
    Year = as.integer(speeches$Year),
    hit = contains_vocab
  ) |>
    group_by(Year) |>
    # Share of documents per year that contain the vocabulary.
    summarise(value = sum(hit) / n(), .groups = "drop")
}) |>
  bind_rows()

# Keep 1999-2019 (rows with a missing year are dropped) and express the
# share in percent.
Tabelle_per_speech <- Tabelle_per_speech |>
  dplyr::filter(Year > 1998, Year < 2020) |>
  mutate(`Document proportion` = value * 100) |>
  select(Year, `Document proportion`) |>
  as.data.frame()

# Combine both yearly measures into one table; `all = TRUE` keeps years that
# appear in only one of the two inputs.
Tabelle2 <- merge(Tabelle, Tabelle_per_speech, by = "Year", all = TRUE)

# Convert to date variable. parse_date_time() is a lubridate function and is
# namespace-qualified here because library(tidyverse) only attaches lubridate
# from tidyverse 2.0.0 onwards.
Tabelle2 <- Tabelle2 |>
  mutate(Year = lubridate::parse_date_time(Year, "%Y"))

# Pivot longer: one row per year and measure, ready for faceting
Tabelle2 <- Tabelle2 |>
  rename("Percentage of speeches" = "Document proportion") |>
  pivot_longer(
    cols = c(`Relative term frequency`, `Percentage of speeches`),
    names_to = "measure",
    values_to = "value"
  )
Tabelle2

# Plot: one loess-smoothed line per measure, each facet with its own y axis
ggplot(Tabelle2, aes(x = Year, y = value)) +
  geom_smooth(formula = y ~ x, method = "loess", color = "black", se = FALSE) +
  labs(x = "Year", y = "Frequency, 100=100%") +
  # NOTE(review): on a datetime axis `add` is presumably measured in seconds,
  # so 0.6 amounts to practically no padding -- confirm the intended value.
  scale_x_datetime(expand = expansion(add = .6)) +
  facet_wrap(vars(measure), scales = "free_y") +
  theme_bw(base_size = 15)

